import os

from docx import Document

from re_utils import re_anonymize_all


def read_word(word_path):
    """Scan a Word document and report text that anonymization would change.

    Prints the file path, then runs ``re_anonymize_all`` on each paragraph.
    Any paragraph whose text comes back different is reported (flag message
    followed by the original paragraph text). Finally the paragraph texts are
    concatenated with no separator and checked once more as a whole, so a
    match that only appears across paragraph boundaries is still flagged.

    Args:
        word_path: Path of the document, passed straight to ``Document``.

    Returns:
        None. All results are written to stdout.
    """
    document = Document(word_path)
    print("文件名： ", word_path)

    paragraph_texts = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        paragraph_texts.append(text)
        # Unchanged text means the anonymizer found nothing to replace.
        if re_anonymize_all(text) == text:
            continue
        print('存在敏感信息')
        print(text)

    # Whole-document pass over the paragraphs glued together back-to-back.
    combined = ''.join(paragraph_texts)
    if re_anonymize_all(combined) != combined:
        print('总体存在敏感信息')



if __name__ == '__main__':
    # Directory that holds the Word documents to scan.
    folder_path = r'D:\工作\IT公司RAG打榜比赛营销相关数据需求\word'

    # List the directory once and reuse the result for both the printout and
    # the scan, so the two always refer to the same set of entries.
    entries = os.listdir(folder_path)
    print(entries)

    for f in entries:
        full_path = os.path.join(folder_path, f)
        # Document() only opens .docx packages. Skip sub-directories, other
        # file types, and the "~$..." lock files Word creates while a document
        # is open; any of these would raise and abort the whole scan.
        if not os.path.isfile(full_path):
            continue
        if f.startswith('~$') or not f.lower().endswith('.docx'):
            continue
        read_word(full_path)

